OBJECTIVE : To predict the hospital re-admission probability of a DIABETIC patient by using appropriate Data Science techniques.
Below are the questions that need to be answered from the analysis:
About DATA : Data is retrieved from UCI (University of California, Irvine) repository
Why this DATA?
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
plt.style.use('seaborn')
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
import statsmodels.api as stat_model
from sklearn.model_selection import GridSearchCV
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import randint as sp_randint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score, precision_score, confusion_matrix,recall_score, precision_recall_curve, auc, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from scipy.stats import skew
from pandas.plotting import scatter_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# Load the training data and report its raw dimensions.
data = pd.read_csv(r"challengetraining_data.csv")
print(data.shape)
data.head()

# Per-column missing-value counts (note: the raw file encodes missing values
# as '?', which is normalised to NaN further below).
data.isnull().sum()

# Relative frequency of every value in every column.
for col in data:
    print('Column name : {}\n{}'.format(col, data[col].value_counts() / len(data)))

# weight / payer_code / medical_specialty are mostly missing — drop them.
data = data.drop(['weight', 'payer_code', 'medical_specialty'], axis=1)

# Normalise the dataset's missing-value markers to NaN, then drop those rows.
data = data.replace('?', np.nan)
data['gender'] = data['gender'].replace('Unknown/Invalid', np.nan)
data = data.dropna()
print(data.shape)
data.head()
# The 23 diabetes medication columns recorded for each encounter.
drugs = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
         'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
         'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone',
         'metformin-pioglitazone', 'citoglipton', 'examide']

# 'numchange' counts how many drugs had a dosage change ('Up'/'Down') during
# the encounter; 'No' and 'Steady' count as no change.
data['numchange'] = 0
for drug in drugs:
    data['numchange'] += data[drug].apply(lambda v: 0 if v in ('No', 'Steady') else 1)

# Recode each drug column to binary: 1 if prescribed (Steady/Up/Down), 0 if not.
for drug in drugs:
    data[drug] = data[drug].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

# 'nummed' counts how many distinct diabetes drugs were prescribed.
data['nummed'] = data[drugs].sum(axis=1)
print(data.shape)
data.head()
# Encode the remaining binary categoricals as 0/1 integers.
data['change'] = data['change'].replace({'Ch': 1, 'No': 0})
data['gender'] = data['gender'].replace({'Male': 1, 'Female': 0})
data['diabetesMed'] = data['diabetesMed'].replace({'Yes': 1, 'No': 0})
print(data.shape)
data.head()

# Keep only the first encounter per patient so rows are independent cases.
data = data.drop_duplicates(subset=['patient_nbr'], keep='first')
print(data.shape)
data.head()

# Drop identifiers and raw diagnosis codes — they are not used as features.
data = data.drop(columns=['diag_1', 'diag_2', 'diag_3', 'encounter_id', 'patient_nbr'])
print(data.shape)
data.head()
# Convert the age bucket labels to the numeric upper bound of each decade
# (e.g. '[50-60)' -> 60).
data['age'] = data.age.map({'[0-10)':10,'[10-20)':20, '[20-30)':30, '[30-40)':40, '[40-50)':50,'[50-60)':60, '[60-70)':70, '[70-80)':80, '[80-90)':90,'[90-100)':100})
# Would raise if any age bucket were left unmapped (NaN cannot cast to int64).
data['age'] = data['age'].astype('int64')
# Target variable: 1 = readmitted, 0 = not readmitted.
# NOTE(review): the raw UCI file encodes this column as '<30'/'>30'/'NO';
# the 'Y'/'N' mapping assumes the challenge CSV was pre-recoded — confirm,
# otherwise this produces an all-NaN target.
data['readmitted'] = data['readmitted'].map({'Y':1,'N':0})
# Lab results: -99 = test not taken, 1 = abnormally high, 0 = normal.
data['A1Cresult'] = data['A1Cresult'].map({'None':-99,'>7':1, '>8':1, 'Norm':0})
data['max_glu_serum'] = data['max_glu_serum'].map({'None':-99,'>300':1,'>200':1,'Norm':0 })
print(data.shape)
data.head()
# Transposed summary statistics for a quick sanity check of the encodings.
data.describe().T
# Remove the drug columns kept out of the final model, leaving metformin,
# glipizide, glyburide and insulin (plus the derived counts).
drugs_drop = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
              'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
              'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone',
              'metformin-pioglitazone', 'citoglipton', 'examide']
data = data.drop(columns=drugs_drop)
print(data.shape)
data.head()
data.info()
# One-hot encode the nominal categoricals, then drop the source columns.
dummy_frames = [
    pd.get_dummies(data[src], prefix=pref)
    for src, pref in [('race', 'race'),
                      ('admission_type_id', 'admission_type'),
                      ('discharge_disposition_id', 'discharge'),
                      ('admission_source_id', 'admission_source')]
]
data = pd.concat([data] + dummy_frames, axis=1)
data = data.drop(columns=['race', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id'])
print(data.shape)
data.head()
train = data

# Numeric columns whose distributions are checked for skew/kurtosis.
num_col = ['age', 'time_in_hospital', 'num_lab_procedures',
           'num_procedures', 'num_medications', 'number_outpatient',
           'number_emergency', 'number_inpatient', 'number_diagnoses']

# Summary table: distribution statistics before/after a candidate log
# transform, and which transform (if any) each column needs.
statdataframe = pd.DataFrame()
statdataframe['numeric_column'] = num_col

skew_before = []
skew_after = []
kurt_before = []
kurt_after = []
standard_deviation_before = []
standard_deviation_after = []
log_transform_needed = []
log_type = []

for i in num_col:
    skewval = train[i].skew()
    skew_before.append(skewval)
    kurtval = train[i].kurtosis()
    kurt_before.append(kurtval)
    sdval = train[i].std()
    standard_deviation_before.append(sdval)

    # Flag a column for transformation when both skewness and kurtosis are
    # large in magnitude.
    if (abs(skewval) > 2) and (abs(kurtval) > 2):
        log_transform_needed.append('Yes')
        # Use a plain log when zeros are rare (<= 2% of rows); otherwise use
        # log1p, which is defined at zero.
        if len(train[train[i] == 0]) / len(train) <= 0.02:
            log_type.append('log')
            # BUG FIX: the original filtered on an undefined name
            # `train_data` here, raising NameError whenever this branch was
            # taken; the intended frame is `train`.
            transformed = np.log(train.loc[train[i] > 0, i])
            skew_after.append(transformed.skew())
            kurt_after.append(transformed.kurtosis())
            standard_deviation_after.append(transformed.std())
        else:
            log_type.append('log1p')
            transformed = np.log1p(train.loc[train[i] >= 0, i])
            skew_after.append(transformed.skew())
            kurt_after.append(transformed.kurtosis())
            standard_deviation_after.append(transformed.std())
    else:
        log_type.append('NA')
        log_transform_needed.append('No')
        skew_after.append(skewval)
        kurt_after.append(kurtval)
        standard_deviation_after.append(sdval)

statdataframe['skew_before'] = skew_before
statdataframe['kurtosis_before'] = kurt_before
statdataframe['standard_deviation_before'] = standard_deviation_before
statdataframe['log_transform_needed'] = log_transform_needed
statdataframe['log_type'] = log_type
statdataframe['skew_after'] = skew_after
statdataframe['kurtosis_after'] = kurt_after
statdataframe['standard_deviation_after'] = standard_deviation_after
statdataframe
# Apply the transforms chosen above: add a *_log / *_log1p column and drop
# rows where the transform would be undefined.
for idx in range(len(statdataframe)):
    if statdataframe['log_transform_needed'][idx] != 'Yes':
        continue
    colname = str(statdataframe['numeric_column'][idx])
    transform = statdataframe['log_type'][idx]
    if transform == 'log':
        train = train[train[colname] > 0]
        train[colname + "_log"] = np.log(train[colname])
    elif transform == 'log1p':
        train = train[train[colname] >= 0]
        train[colname + "_log1p"] = np.log1p(train[colname])

# The raw count columns are superseded by their transformed versions.
train = train.drop(['number_outpatient', 'number_inpatient', 'number_emergency'], axis=1)
print(train.shape)
train.head()
# Outlier removal: keep only rows whose numeric features all lie within
# three standard deviations of their column mean.
num_cols = ['age', 'time_in_hospital', 'num_lab_procedures',
            'num_procedures', 'num_medications', 'number_diagnoses']
zscores = np.abs(sp.stats.zscore(train[num_cols]))
train = train[(zscores < 3).all(axis=1)]
print(train.shape)
train.head()
# Exploratory visualisations of the numeric features.
attributes = num_cols

# Pairwise scatter matrix coloured by the readmission label.
scatter_matrix(train[attributes], figsize=(20, 15), c=train.readmitted,
               alpha=0.8, cmap="Reds", marker='+')
plt.show()

# Correlation heatmap.
plt.figure(figsize=(15, 10))
sns.heatmap(train[attributes].corr(), annot=True, cmap="Reds")
plt.show()

# Pairplot, boxplot and class-balance count.
sns.pairplot(train, hue='readmitted', vars=num_cols, palette="Reds", markers="+")
plt.figure(figsize=(15, 10))
sns.boxplot(data=train[num_cols], palette="Reds")
sns.countplot(train['readmitted'], label="Count", palette="Reds")
# Split features from the target.
train_input = train.drop('readmitted', axis=1)
train_output = train['readmitted']
print(train_input.shape)
print(train_output.shape)

# The classes are imbalanced — oversample the minority class with SMOTE.
print('Original dataset shape {}'.format(Counter(train_output)))
sm = SMOTE(random_state=20)
# BUG FIX: SMOTE.fit_sample was renamed to fit_resample in imbalanced-learn
# 0.4 and later removed; fit_resample is the supported API.
train_input_new, train_output_new = sm.fit_resample(train_input, train_output)
print('New dataset shape {}'.format(Counter(train_output_new)))
sns.countplot(train_output_new, label="Count", palette="Reds")

# Hold out 20% for testing, then min-max scale; the scaler is fitted on the
# training split only to avoid leaking test-set statistics.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    train_input_new, train_output_new, test_size=0.2, random_state=2)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)
print(X_train)
print(X_test)
1) Logistic Regression :
# ---- 1) Logistic Regression -------------------------------------------------
# FIX: the modern default solver (lbfgs) rejects penalty='l1', which made
# both the grid search and the final fit fail; liblinear supports both
# penalties searched below.
logit = LogisticRegression(solver='liblinear')
param_grid = {'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(logit, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_params_

# One results table accumulates train/test scores for every model.
results = pd.DataFrame(index=None, columns=['model', 'f1_score_train', 'f1_score_test',
                                            'train_precision_score', 'test_precision_score',
                                            'train_recall_score', 'test_recall_score'])

# Fit with the best penalty and record the scores.
lreg_clf = LogisticRegression(penalty='l1', solver='liblinear')
lreg_clf.fit(X_train, y_train)
y_lreg_clf = lreg_clf.predict(X_test)
f1_score_train = f1_score(y_train, lreg_clf.predict(X_train))
f1_score_test = f1_score(y_test, lreg_clf.predict(X_test))
train_precision_score = precision_score(y_train, lreg_clf.predict(X_train))
test_precision_score = precision_score(y_test, lreg_clf.predict(X_test))
train_recall_score = recall_score(y_train, lreg_clf.predict(X_train))
test_recall_score = recall_score(y_test, lreg_clf.predict(X_test))
# FIX: DataFrame.append was removed in pandas 2.0 — build a one-row frame
# and concatenate instead.
row = pd.DataFrame([{'model': 'Logistic Regression',
                     'f1_score_train': f1_score_train,
                     'f1_score_test': f1_score_test,
                     'train_precision_score': train_precision_score,
                     'train_recall_score': train_recall_score,
                     'test_recall_score': test_recall_score,
                     'test_precision_score': test_precision_score}])
results = pd.concat([results, row], ignore_index=True)
results

# Confusion matrix on the held-out test set.
y_pred = lreg_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap="Reds")
plt.xlabel('Predicted')
plt.ylabel('Actual')
2) Decision Tree :
# ---- 2) Decision Tree -------------------------------------------------------
from sklearn.tree import DecisionTreeClassifier

# Grid-search the tree depth with 5-fold cross-validation.
dt_clf = DecisionTreeClassifier(random_state=10)
param_grid = {'max_depth': [5, 6, 7, 8, 10, 12, 15, 20, 50, 100]}
grid_search = GridSearchCV(dt_clf, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_params_

# Refit at the selected depth.  CONSISTENCY FIX: carry the same
# random_state used during the search so the reported scores are
# reproducible (the original refit was unseeded).
dt_clf = DecisionTreeClassifier(max_depth=12, random_state=10)
dt_clf.fit(X_train, y_train)
y_dt_clf = dt_clf.predict(X_test)
train_precision_score = precision_score(y_train, dt_clf.predict(X_train))
test_precision_score = precision_score(y_test, dt_clf.predict(X_test))
f1_score_train = f1_score(y_train, dt_clf.predict(X_train))
f1_score_test = f1_score(y_test, dt_clf.predict(X_test))
train_recall_score = recall_score(y_train, dt_clf.predict(X_train))
test_recall_score = recall_score(y_test, dt_clf.predict(X_test))
# FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'model': 'Decision Tree',
                     'f1_score_train': f1_score_train,
                     'f1_score_test': f1_score_test,
                     'train_precision_score': train_precision_score,
                     'train_recall_score': train_recall_score,
                     'test_recall_score': test_recall_score,
                     'test_precision_score': test_precision_score}])
results = pd.concat([results, row], ignore_index=True)
results

# Confusion matrix on the held-out test set.
y_pred = dt_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap="Reds")
plt.xlabel('Predicted')
plt.ylabel('Actual')
3) Random Forest :
# ---- 3) Random Forest -------------------------------------------------------
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Randomised search over the forest's main regularisation knobs.
param_grid = {"max_depth": [3, 5, 6, 8],
              "max_features": sp_randint(1, 25),
              "min_samples_split": sp_randint(2, 30),
              "min_samples_leaf": sp_randint(1, 20),
              "bootstrap": [True, False]}
random_search = RandomizedSearchCV(RandomForestClassifier(n_estimators=1000),
                                   param_distributions=param_grid,
                                   n_iter=20, random_state=0, n_jobs=-1,
                                   return_train_score=True)
random_search.fit(X_train, y_train)
random_search.best_params_

# Refit with the parameters found by the search (hard-coded from a previous
# run).  CONSISTENCY FIX: pin random_state so the reported scores are
# reproducible, matching the seeded search above.
rf_clf = RandomForestClassifier(bootstrap=False, max_depth=8, max_features=22,
                                min_samples_leaf=1, min_samples_split=5,
                                random_state=0)
rf_clf.fit(X_train, y_train)
y_rf_clf = rf_clf.predict(X_test)
train_precision_score = precision_score(y_train, rf_clf.predict(X_train))
test_precision_score = precision_score(y_test, rf_clf.predict(X_test))
f1_score_train = f1_score(y_train, rf_clf.predict(X_train))
f1_score_test = f1_score(y_test, rf_clf.predict(X_test))
train_recall_score = recall_score(y_train, rf_clf.predict(X_train))
test_recall_score = recall_score(y_test, rf_clf.predict(X_test))
# FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'model': 'Random Forest',
                     'f1_score_train': f1_score_train,
                     'f1_score_test': f1_score_test,
                     'train_precision_score': train_precision_score,
                     'train_recall_score': train_recall_score,
                     'test_recall_score': test_recall_score,
                     'test_precision_score': test_precision_score}])
results = pd.concat([results, row], ignore_index=True)
results

# Confusion matrix on the held-out test set.
y_pred = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap="Reds")
plt.xlabel('Predicted')
plt.ylabel('Actual')
# Compare train vs test f1 across the three models.
cols = ['model', 'f1_score_train', 'f1_score_test']
results[cols].set_index('model').plot(kind='bar', figsize=(15, 8), cmap='Reds')
plt.title('Train and Test f1_score')

# Persist the fitted scaler and the random-forest model for the web app.
import pickle
# FIX: pickle.dump(obj, open(...)) leaked unclosed file handles; use a
# context manager so the files are flushed and closed deterministically.
with open('tranform.pkl', 'wb') as fh:
    pickle.dump(scaler, fh)
with open('model.pkl', 'wb') as fh:
    pickle.dump(rf_clf, fh)
# NOTE(review): 'tranform.pkl' looks like a typo for 'transform.pkl', but the
# serving code presumably loads this exact name — confirm before renaming.

# Sanity check: scale a single raw test row and score it.
X_test = scaler.transform(X_test_unscaled[:1])
predictions = rf_clf.predict(X_test)
print("Predicted Result : ", predictions)
predictions = rf_clf.predict_proba(X_test)
print("Predicted Result probability : ", predictions)

# Screenshot of the deployed product interface.
from IPython.display import Image
Image(filename=r'interface.png', width=1000, height=60)
import plotly.graph_objs as go
import plotly.offline as py

# Rank features by random-forest importance, ascending, so the horizontal
# bar chart renders the most important feature at the top.
features = train_input.columns.values
importances, names = (list(t) for t in zip(*sorted(zip(rf_clf.feature_importances_, features),
                                                   reverse=False)))

importance_bar = go.Bar(
    x=importances,
    y=names,
    marker=dict(
        color=importances,
        colorscale='Viridis',
        reversescale=True,
    ),
    name='Random Forest Feature importance',
    orientation='h',
)
layout = dict(
    title='Barplot of Feature importances',
    width=900, height=2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ),
    margin=dict(l=300),
)
fig1 = go.Figure(data=[importance_bar])
fig1['layout'].update(layout)
py.plot(fig1, filename='plots')

# Static export of the plot above.
from IPython.display import Image
Image(filename=r'newplot.png', width=500, height=30)
INSIGHTS for Readmission :
- 1) Each additional 15 years of age increases the odds of readmission by 23%
- 2) Discharge type, discharge to home is the most prominent factor in classifying readmission of diabetic patient
- 3) Out of the 23 medicines, we found that the use of Insulin and Metformin increases the chance of readmission
- 4) Patients of Caucasian race are highly likely to be readmitted to the hospital
- 5) Diabetic males have higher odds of readmission than females
# Display the insights summary screenshot.
from IPython.display import Image
Image(filename=r'insights.png',width=1000, height=60)
IMPROVEMENTS in project :
- 1) Using more features such as diag_1, diag_2, diag_3 to build the machine learning model
- 2) Adding more prominent features on product interface as input
- 3) Available dataset was from duration 1999-2008, getting more recent data to build the model
- 4) Since the dataset was highly imbalanced, we restricted balancing of the dataset to an oversampling technique. We could also use an undersampling technique to balance the dataset and improve the model
- 5) Employing more machine learning algorithms to build models and compare performance; we used only 3 algorithms: Logistic Regression, Decision Tree and Random Forest
# Display the improvements summary screenshot.
from IPython.display import Image
Image(filename=r'Improvements.png',width=1000, height=60)